#imports
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from collections import defaultdict, Counter
import json
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
import helpers
%pylab inline
#Load zip-to-FIPS mapping.
#Sourced from: https://raw.githubusercontent.com/bgruber/zip2fips/master/zip2fips.json
# NOTE(review): the original comment said the mapping is cast to a defaultdict
# so unknown zips don't raise, but the code actually builds a plain dict
# (unknown zips raise KeyError). Behavior is kept as-is — confirm which was
# intended before relying on lookups of unseen zip codes.
with open('zip2fips-master/zip2fips.json', 'r') as f:  # `with` closes the handle (was leaked before)
    zips2fips = dict(json.load(f))
#Load weather data file; the first CSV column is the row index
daily_weather_df = pd.read_csv('daily_weather.csv', index_col=0)
With any new data science project, the first step is understanding what the data looks like, including the relevant fields and what a typical record looks like.
# Inspect the weather data: schema, record count, date coverage, feature list.
daily_weather_df.head()
print(f'There are {len(daily_weather_df)} records')
# The span of dates covered by the weather file
print(f"First day: {daily_weather_df['date'].min()}")
print(f"Last day: {daily_weather_df['date'].max()}")
# Weather features are exactly the columns prefixed with 'met'
weather_features = [col for col in daily_weather_df.columns if col.startswith('met')]
print(f'{len(weather_features)} weather features per day\n')
print(', '.join(weather_features))
Below I plot a CDF of the range of values for each weather feature. The vertical dotted lines mark the 5th and 95th percentiles. These CDFs give us a sense of the range and variance of each feature's values, as well as the number and extremeness of any outlying values.
# One CDF figure per weather feature (dotted lines mark the 5th/95th pct).
for wf in weather_features:
    helpers.plot_weather_feature_values(wf, daily_weather_df)
    plt.show()
#Now let's look at the wheat data
wheat_2014_df = pd.read_csv('gpal/2014-Survey Data-Wheat .csv')
wheat_2015_df = pd.read_csv('gpal/2015-Survey Data-Wheat .csv')
wheat_2016_df = pd.read_csv('gpal/2016 Survey Data-Wheat .csv')
wheat_2014_df.head()
wheat_2015_df.head()
wheat_2016_df.head()
# Collect the distinct zip codes seen across all three survey years.
# (Renamed from `seen_fips`: the values are zip codes, not FIPS codes,
# as the print statement below confirms; the name is not used elsewhere.)
seen_zips = set(wheat_2014_df['zip_code'].unique())
seen_zips.update(wheat_2015_df['zip_code'].unique())
seen_zips.update(wheat_2016_df['zip_code'].unique())
print('There are {} zip codes represented in the data'.format(len(seen_zips)))
# Grain-quality columns we will model as prediction targets.
target_cols = [
    'protein_12',
    'actual_wheat_ash',
    'falling_no',
]
The processing step removes rows whose zip code has no known FIPS mapping, or no weather data for that zip code. It also casts the year to int, removes rows that aren't needed for this analysis, and removes rows with NaN values.
# Clean each yearly survey in one pass: drop rows with no zip→FIPS mapping
# or no weather coverage, cast year to int, drop unneeded/NaN rows (see helpers).
wheat_2014_df, wheat_2015_df, wheat_2016_df = (
    helpers.process_wheat_data(df)
    for df in (wheat_2014_df, wheat_2015_df, wheat_2016_df)
)
wheat_2016_df.head()
To get a better sense of the distribution of the values for each of the target features, I plot a CDF as well as a vertical line at the values corresponding to the 5th and 95th percentile of the data. This allows us to see where most of the data lies and what outliers look like and how common they are.
In addition to CDFs, I plot each weather feature against each wheat quality feature. These plots give a general sense of which weather features are correlated with which wheat quality features, and how.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat is the supported equivalent (same default row-stacking behavior).
wheat_df = pd.concat([wheat_2014_df, wheat_2015_df, wheat_2016_df])
# CDF of each target's values, with 5th/95th-percentile markers.
for target_col in target_cols:
    helpers.plot_target_values_cdf(target_col, wheat_df)
    plt.show()
We are interested in understanding the correlation between the weather features and the grain quality features. However, for each grain quality record (year/fips) we have a daily recording for each weather feature. This is 10 features per day for a year, or 3650 features. Because there are so many features (many more than samples), I aggregate the weather features on a monthly window — so each weather feature gets 12 "readings" (just the average of the ~30 daily readings in the month).
So for example, instead of a daily met_max_t for each fips, I use a monthly met_max_t (so met_max_t_0 is the average for January, met_max_t_1 is the average met_max_t for February, etc.).
# Replace the 365 daily readings per weather feature with 12 monthly means
# (e.g. met_max_t_0 .. met_max_t_11), one set per survey year.
wheat_2014_df, wheat_2015_df, wheat_2016_df = (
    helpers.get_monthly_weather_features(df, daily_weather_df, yr, weather_features)
    for df, yr in ((wheat_2014_df, 2014), (wheat_2015_df, 2015), (wheat_2016_df, 2016))
)
def _plot_monthly_feature_scatter(wheat_year_df, year):
    """Scatter each monthly-aggregated weather feature against each target.

    One figure per weather feature; each figure has one subplot per target
    column, with every monthly column of that feature overlaid.
    (Replaces three copy-pasted loops — one per year — with a single helper.)
    """
    for feat in weather_features:
        feat_cols = [c for c in wheat_year_df.columns if c.startswith(feat)]
        fig, ax = plt.subplots(1, 3, figsize=(15, 6))
        for i, target_col in enumerate(target_cols):
            for fc in feat_cols:
                ax[i].scatter(wheat_year_df[target_col].values, wheat_year_df[fc].values)
            ax[i].set_title('{} v {}'.format(target_col, feat))
            ax[i].set_xlabel(target_col)
            ax[i].set_ylabel('{} (agg monthly) in {}'.format(feat, year))

_plot_monthly_feature_scatter(wheat_2014_df, 2014)
_plot_monthly_feature_scatter(wheat_2015_df, 2015)
_plot_monthly_feature_scatter(wheat_2016_df, 2016)
No weather feature is obviously highly correlated with any grain quality feature. However, often these relationships are complex.
The next step is to apply a modelling technique to extract the most important features. I'll use linear regression and a random forest.
Linear regression is usually a good place to start because it is simple and well understood. The coefficients of the features correspond to the "most important features". I also try using a random forest.
First, collect the sample values and the features. I also scale the features to have mean 0 and variance 1 to reduce the effect of weather features with highly different magnitudes. While this isn't strictly necessary for linear regression, it is good for lasso and other machine learning methods.
# First pass: linear regression on the 2014 data only, targeting protein_12.
feat_cols = []
for feat in weather_features:
    feat_cols.extend(c for c in wheat_2014_df.columns if c.startswith(feat))
target_col = 'protein_12'  # now actually used below (was hardcoded separately)
wheat_2014_df = wheat_2014_df.dropna()
# .values already returns ndarrays, so the extra np.asarray calls were redundant;
# the unused `year = 2014` assignment is dropped.
x = wheat_2014_df[feat_cols].values
y = wheat_2014_df[target_col].values
# Scale features to mean 0 / variance 1 so magnitudes are comparable.
x_scaled = StandardScaler().fit_transform(x)
model = LinearRegression()
model.fit(x_scaled, y)
print('{} samples, {} features'.format(x_scaled.shape[0], x_scaled.shape[1]))
It is better to have a lot more samples than features, so I'm going to aggregate the 2014, 2015 and 2016 data
# Pool all three years so samples greatly outnumber features.
# (DataFrame.append was removed in pandas 2.0; pd.concat is equivalent.)
wheat_df = pd.concat([wheat_2014_df, wheat_2015_df, wheat_2016_df])
wheat_df = wheat_df.dropna()
feat_cols = []
for feat in weather_features:
    feat_cols.extend(c for c in wheat_2014_df.columns if c.startswith(feat))
N = 15            # number of top features to display (was used before being defined)
bar_height = 0.8  # barh bar height (the original referenced an undefined `width`)
for target_col in target_cols:
    x = wheat_df[feat_cols].values
    y = wheat_df[target_col].values
    #Scale data to mean 0 and variance 1
    x_scaled = StandardScaler().fit_transform(x)
    #Linear regression: |coefficient| on scaled features ~ feature importance
    model = LinearRegression()
    model.fit(x_scaled, y)
    abs_coef = np.abs(model.coef_)
    # Indices of the N largest |coefficients|, ascending so the most
    # important feature lands at the top of the horizontal bar chart.
    # (The original sliced coef_[:N], i.e. the FIRST 15 features, not the top 15.)
    top_idx = np.argsort(abs_coef)[-N:]
    fig, ax = plt.subplots()
    ind = np.arange(N)
    ax.barh(ind, abs_coef[top_idx], bar_height)
    ax.set_yticks(ind)
    ax.set_yticklabels([feat_cols[i] for i in top_idx])
    plt.title('Top {} Feature Importances for {}'.format(N, target_col))
    plt.xlabel('Relative importance')
    plt.ylabel('Weather Feature')
This is telling us that temperature has the largest effect on all three target features.
From the plots I made above, it seems like the relationships between the weather features and the grain quality features are not linear. Random forest is a simple model that can find non-linear relationships in data. It also has the benefit of learning the most important features.
N = 15            # number of top features to display
bar_height = 0.8  # barh bar height (the original referenced an undefined `width`)
for target_col in target_cols:
    x = wheat_df[feat_cols].values
    y = wheat_df[target_col].values
    # Trees don't need feature scaling and expose importances directly.
    model = RandomForestRegressor()
    model.fit(x, y)
    importances = model.feature_importances_
    # Indices of the N largest importances, ascending so the most important
    # feature lands at the top of the chart.
    # (The original sliced [:N], i.e. the FIRST 15 features, not the top 15.)
    top_idx = np.argsort(importances)[-N:]
    fig, ax = plt.subplots()
    ind = np.arange(N)
    ax.barh(ind, importances[top_idx], bar_height)
    ax.set_yticks(ind)
    ax.set_yticklabels([feat_cols[i] for i in top_idx])
    plt.title('Top {} Feature Importances for {}'.format(N, target_col))
    plt.xlabel('Relative importance')
    plt.ylabel('Weather Feature')
With more time:
I would do a more thorough exploration of the data. For example: How many nan values do we have? Which zip codes have the most and the least data? Where do the outliers occur?
I would also do a gridsearch over the parameters of the linear regression and random forest models to find the best parameters. This would require some sort of cross-validation too.
Explore other features. In this exercise, the features I used were the weather features for the year of the grain data aggregated by month. I would want to try the features aggregated by week or season. I imagine that the previous year's data could be important too.
Explore binning the continuous wheat quality features into discrete bins (e.g. low, medium, high)